library(data.table)
library(dplyr)
library(DT)
library(ggplot2)
library(gtools)
library(vegan)
library(iNEXT)
library(fossil)
library(ggrepel)

1) 18S amplicons

1.1) Data overview

Let’s read the dataset and remove the samples containing less than 1154 reads:

## [1] 8642   58
##   OTU_no TARA_100_SUR_0d2_3 TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3
## 1  OTU_1                  0                  0                  0
## 2  OTU_2                  0                  0                  0
## 3  OTU_3                  0                  0                  0
## 4  OTU_4                  0                  0                  0
## 5  OTU_5                  0                  0                  0
##   TARA_110_SUR_0d2_3
## 1                  0
## 2                  0
## 3                  0
## 4                  0
## 5                  0
## Warning in `[<-.factor`(`*tmp*`, thisvar, value = "NA"): invalid factor
## level, NA generated
## [1] 8642   57
##       TARA_100_SUR_0d2_3 TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3
## OTU_1                  0                  0                  0
## OTU_2                  0                  0                  0
## OTU_3                  0                  0                  0
## OTU_4                  0                  0                  0
## OTU_5                  0                  0                  0
##       TARA_110_SUR_0d2_3 TARA_111_SUR_0d2_3
## OTU_1                  0                  0
## OTU_2                  0                  0
## OTU_3                  0                  1
## OTU_4                  0                  0
## OTU_5                  0                  0
## [1] 8642    2
##                                                                                                                                              OTUId
## OTU_1                            SILVA.v123_KJ760253.1.1809_Eukaryota;SAR;Rhizaria;Retaria;Polycystinea;Spumellaria;Actinomma;uncultured_eukaryote
## OTU_2                            SILVA.v123_AF018162.1.1798_Eukaryota;SAR;Rhizaria;Retaria;Polycystinea;Collodaria;Collozoum;Collozoum_serpentinum
## OTU_3         SILVA.v123_KJ759918.1.2019_Eukaryota;Excavata;Discoba;Discicristata;Euglenozoa;Diplonemea;SCM38C39_marine_group;uncultured_eukaryote
## OTU_4 SILVA.v123_FN598252.1.1343_Eukaryota;Excavata;Discoba;Discicristata;Euglenozoa;Diplonemea;SCM38C39_marine_group;uncultured_marine_diplonemid
## OTU_5                       SILVA.v123_KJ760105.1.1803_Eukaryota;SAR;Alveolata;Protalveolata;Syndiniales;Syndiniales_Group_II;uncultured_eukaryote
##       classif
## OTU_1    <NA>
## OTU_2    <NA>
## OTU_3    <NA>
## OTU_4    <NA>
## OTU_5    <NA>
## [1] 8642   55
##       TARA_100_SUR_0d2_3 TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3
## OTU_1                  0                  0                  0
## OTU_2                  0                  0                  0
## OTU_3                  0                  0                  0
## OTU_4                  0                  0                  0
## OTU_5                  0                  0                  0
##       TARA_110_SUR_0d2_3 TARA_111_SUR_0d2_3
## OTU_1                  0                  0
## OTU_2                  0                  0
## OTU_3                  0                  1
## OTU_4                  0                  0
## OTU_5                  0                  0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      93    1840    3086    4440    5808   23780
## TARA_100_SUR_0d2_3 TARA_141_SUR_0d2_3  TARA_93_SUR_0d2_3 
##                949                881                 93 
##  TARA_94_SUR_0d2_3             MP0528             MP0778 
##                638                176                805 
##             MP1176             MP1421             MP1857 
##                663                941                498 
##             MP2243 
##                509
## [1] 8642   45
## [1] 8642   43
## [1] 8642    0

Table dimensions and content outline:

## [1] 8642   43
##       TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3
## OTU_1                  0                  0                  0
## OTU_2                  0                  0                  0
## OTU_3                  0                  0                  0
## OTU_4                  0                  0                  0
## OTU_5                  0                  0                  0
##       TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3
## OTU_1                  0                  0
## OTU_2                  0                  0
## OTU_3                  1                  0
## OTU_4                  0                  0
## OTU_5                  0                  0

Minimum number of reads per station:

min(colSums(tb18_tax_occur_min1154)) 
## [1] 1154

Maximum number of reads per station:

max(colSums(tb18_tax_occur_min1154)) 
## [1] 23781

Identification of stations with higher number of reads:

amplicons_per_sample<-colSums(tb18_tax_occur_min1154)
amplicons_per_sample[which(colSums(tb18_tax_occur_min1154)>20000)]
## TARA_84_SUR_0d2_3 
##             23781

Overall reads per sample:

1.2) Normalization

Let’s normalize the original dataset by randomly subsampling 1154 reads in each station:

tb18_tax_occur_min1154_t<-t(tb18_tax_occur_min1154)
tb18_tax_occur_ss1154<-rrarefy(tb18_tax_occur_min1154_t, 1154)

The normalized table shows the following dimensions and format:

## [1]   43 8642
##                    OTU_1 OTU_2 OTU_3 OTU_4 OTU_5
## TARA_102_SUR_0d2_3     0     0     0     0     0
## TARA_109_SUR_0d2_3     0     0     0     0     0
## TARA_110_SUR_0d2_3     0     0     0     0     0
## TARA_111_SUR_0d2_3     0     0     1     0     0
## TARA_112_SUR_0d2_3     0     0     0     0     0

Its content fits with the expected normalization values (1154 reads per station):

rowSums(tb18_tax_occur_ss1154)
## TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3 
##               1154               1154               1154 
## TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3 TARA_122_SUR_0d2_3 
##               1154               1154               1154 
## TARA_123_SUR_0d2_3 TARA_124_SUR_0d2_3 TARA_125_SUR_0d2_3 
##               1154               1154               1154 
## TARA_128_SUR_0d2_3 TARA_132_SUR_0d2_3 TARA_133_SUR_0d2_3 
##               1154               1154               1154 
## TARA_137_SUR_0d2_3 TARA_138_SUR_0d2_3 TARA_140_SUR_0d2_3 
##               1154               1154               1154 
## TARA_142_SUR_0d2_3 TARA_145_SUR_0d2_3 TARA_146_SUR_0d2_3 
##               1154               1154               1154 
## TARA_148_SUR_0d2_3 TARA_149_SUR_0d2_3 TARA_150_SUR_0d2_3 
##               1154               1154               1154 
## TARA_151_SUR_0d2_3 TARA_152_SUR_0d2_3  TARA_56_SUR_0d2_3 
##               1154               1154               1154 
##  TARA_57_SUR_0d2_3  TARA_62_SUR_0d2_3  TARA_64_SUR_0d2_3 
##               1154               1154               1154 
##  TARA_65_SUR_0d2_3  TARA_66_SUR_0d2_3  TARA_68_SUR_0d2_3 
##               1154               1154               1154 
##  TARA_72_SUR_0d2_3  TARA_76_SUR_0d2_3  TARA_78_SUR_0d2_3 
##               1154               1154               1154 
##  TARA_82_SUR_0d2_3  TARA_84_SUR_0d2_3  TARA_85_SUR_0d2_3 
##               1154               1154               1154 
##  TARA_96_SUR_0d2_3  TARA_98_SUR_0d2_3  TARA_99_SUR_0d2_3 
##               1154               1154               1154 
##             MP0311             MP1517             MP1672 
##               1154               1154               1154 
##             MP2821 
##               1154

Let’s check out how many OTUs don’t appear in the new table:

length(which(colSums(tb18_tax_occur_ss1154)==0)) #2923
## [1] 2914

There are 2923 OTUs that don’t show any occurrence in the normalized data. Let’s remove them from the table and take a look at its final dimensions:

tb18_tax_occur_ss1154_no_cero<-tb18_tax_occur_ss1154[,-(which(colSums(tb18_tax_occur_ss1154)==0))]
tb18_tax_occur_ss1154_no_cero<-tb18_tax_occur_ss1154_no_cero[mixedorder(row.names(tb18_tax_occur_ss1154_no_cero)),]
dim(tb18_tax_occur_ss1154_no_cero) #43 5719
## [1]   43 5728

Datasets summary:

dim(tb18_tax) #8642   57
## [1] 8642   57
dim(tb18_tax_occur) #8642   55
## [1] 8642   55
dim(tb18_tax_occur_ss1154_no_cero) #43 5719
## [1]   43 5728

1.3) General community analysis

1.3.1) Richness and evenness (Shannon index)

Most of the samples take Shannon Index values around 6:

1.3.2) Richness: OTU number

Lowest number of OTUs per sample:

## [1] 356

Maximum number of OTUs per sample:

## [1] 800

In most of the samples, we can identify between 600 and 700 OTUs:

plot(sort(OTUs_per_sample_18S_tax_occur_ss1154), pch=19)

boxplot(OTUs_per_sample_18S_tax_occur_ss1154, pch=19)

1.3.3) Index of evenness

1.3.3.1) Pielou’s index

pielou_evenness_18S_tax_occur_ss1154<-tb18_tax_occur_ss1154_div/log(OTUs_per_sample_18S_tax_occur_ss1154)

The Pielou index (constrained between 0 and 1) takes values closer to 1 as the variation of species proportion in a sample increases. Most of the samples get values around 0.95, meaning that the numerical composition of different OTUs in a sample is highly variable - there’s no constant presence of dominant species.

The less variation in communities between the species (and the presence of a dominant specie), the lower J’ is.

plot(sort(pielou_evenness_18S_tax_occur_ss1154), pch=19)

boxplot(pielou_evenness_18S_tax_occur_ss1154, pch=19)

The OTU_38, with 890 reads, is the most abundant in the overall dataset:

head(sort(colSums(tb18_tax_occur_ss1154_no_cero), decreasing=T), n=10L)
##   OTU_38 OTU_3677  OTU_751 OTU_5415 OTU_3374 OTU_2494 OTU_2631 OTU_3177 
##      868      689      545      534      438      344      303      282 
## OTU_1032 OTU_3569 
##      263      256

Most of the OTUs show very few occurrences; suggesting that we will probably be able to identify a significant ammount of rare otus:

plot(log(sort(colSums(tb18_tax_occur_ss1154_no_cero), decreasing=T)), pch=19)

1.3.4) Abundance Models

1.3.4.1) Rank-Abundance or Dominance/Diversity Model (“radfit”)

The OTUs abundance distribution fits relativelly close to log-normal model.

1.3.4.2) Preston’s Lognormal Model

According to Preston’s lognormal model fit into species frequencies groups, we’re missing 1588.816 species:

tb18_tax_occur_ss1154_prestonfit<-prestonfit(colSums(tb18_tax_occur_min1154_t))
plot(tb18_tax_occur_ss1154_prestonfit, main="Pooled species")

veiledspec(tb18_tax_occur_ss1154_prestonfit)
## Extrapolated     Observed       Veiled 
##     9937.816     8349.000     1588.816

When computing Prestons’ lognormal model fit without pooling data into groups, we seem to miss 1385.371 species:

## Extrapolated     Observed       Veiled 
##     9734.371     8349.000     1385.371

1.3.5) Rarefaction curves of rarefied and non-rarefied datasets

rarec_input<-t(as.matrix(colSums(tb18_tax_occur_ss1154_no_cero)))

#tb18_rarecurve_step1000_10000<-rarecurve(rarec_input, step = 1000, 10000, xlab = "Sample size", ylab = "OTUs", label = TRUE, main="18S amplicons diversity step=1000 & ss=10000\n(40,816 OTUs; 5,247,375 reads)\n")
#tb18_rarecurve_step1000_10000

tb18_rarecurve_step100_5700<-rarecurve(rarec_input, step = 100, 5700, xlab = "Sample size", ylab = "OTUs", label = TRUE, main="18S amplicons diversity step=100 & ss=5700\n(5,719 OTUs; 49,622 reads)\n")

#without rarefying
rarec_allOTUs_input<-t(as.matrix(colSums(t(tb18_tax_occur))))
tb18_rarecurve_allOTUs_step100_8642<-rarecurve(rarec_allOTUs_input, step = 100, 8642, xlab = "Sample size", ylab = "OTUs", label = TRUE, main="18S amplicons diversity non-rarefied step=1000 & ss=100000\n(8,642 OTUs; 244,184 reads)\n")

1.3.6) Beta diversity

1.3.6.1) Dissimilarity matrix using Bray-Curtis index:

The Bray-Curtis dissimilarity, constrained between 0 (minimum distance) and 1 (highest dissimilarity) allows us to quantify the differences between samples according to the composition and relative abundance of their OTUs. In our dataset, most of the samples pairs take dissimilarity values around 0.8, meaning that their composition is substantially different.

1.3.6.2) Hierarchical clustering

The stations seem to form clusters according to geographic localization, but there are no evident clusters separated from the general groups, apart from the one containing TARA_84, TARA_82 and TARA_85.

(To be done: assign Longhurst provinces information to each station and check if any of the central clusters is meaningful regarding to the samples’ geographical location)

1.3.6.3) Non-metric multidimensional scaling

We can identify a prominent group in the central part of the NMDS plot and a few outliers (TARA 82, 84 and 85) in the central-right edge of the plot. The stress parameter takes a value below 0.2, suggesting that the plot is acceptable.

## 
## Call:
## monoMDS(dist = tb18_tax_occur_ss1154_no_cero.bray) 
## 
## Non-metric Multidimensional Scaling
## 
## 43 points, dissimilarity 'bray', call 'vegdist(x = tb18_tax_occur_ss1154_no_cero, method = "bray")'
## 
## Dimensions: 2 
## Stress:     0.1373119 
## Stress type 1, weak ties
## Scores scaled to unit root mean square, rotated to principal components
## Stopped after 81 iterations: Stress nearly unchanged (ratio > sratmax)

When implementing a most robut function for computing NMDS plots, the result is quiet the same:

## Run 0 stress 0.1393899 
## Run 1 stress 0.1393083 
## ... New best solution
## ... Procrustes: rmse 0.005260058  max resid 0.03069048 
## Run 2 stress 0.1373122 
## ... New best solution
## ... Procrustes: rmse 0.02523206  max resid 0.1569697 
## Run 3 stress 0.1372899 
## ... New best solution
## ... Procrustes: rmse 0.002165978  max resid 0.009552862 
## ... Similar to previous best
## Run 4 stress 0.1393081 
## Run 5 stress 0.1373685 
## ... Procrustes: rmse 0.003151796  max resid 0.01736886 
## Run 6 stress 0.1372891 
## ... New best solution
## ... Procrustes: rmse 0.0002659927  max resid 0.001106764 
## ... Similar to previous best
## Run 7 stress 0.1373125 
## ... Procrustes: rmse 0.002111158  max resid 0.009669421 
## ... Similar to previous best
## Run 8 stress 0.1393895 
## Run 9 stress 0.1372898 
## ... Procrustes: rmse 0.0002476806  max resid 0.0009210563 
## ... Similar to previous best
## Run 10 stress 0.1373131 
## ... Procrustes: rmse 0.002151691  max resid 0.009615026 
## ... Similar to previous best
## Run 11 stress 0.1393957 
## Run 12 stress 0.1393893 
## Run 13 stress 0.1393895 
## Run 14 stress 0.1393095 
## Run 15 stress 0.1372895 
## ... Procrustes: rmse 0.0004200128  max resid 0.00178044 
## ... Similar to previous best
## Run 16 stress 0.1372904 
## ... Procrustes: rmse 0.000361044  max resid 0.0014713 
## ... Similar to previous best
## Run 17 stress 0.1372891 
## ... New best solution
## ... Procrustes: rmse 0.0002188077  max resid 0.0008048182 
## ... Similar to previous best
## Run 18 stress 0.1393906 
## Run 19 stress 0.13939 
## Run 20 stress 0.1393944 
## *** Solution reached
## Warning in ordiplot(x, choices = choices, type = type, display = display, :
## Species scores not available

1.4) Geographical analysis

## Warning in if (class(lats) == "SpatialPoints") lats <- coordinates(lats):
## the condition has length > 1 and only the first element will be used

Working datasets:

  1. Community matrix: tb18_tax_occur_ss1154_no_cero
dim(tb18_tax_occur_ss1154_no_cero)
## [1]   43 5728
tb18_tax_occur_ss1154_no_cero[1:5, 1:5]
##                    OTU_1 OTU_2 OTU_3 OTU_5 OTU_6
## MP0311                 0     0     0     0     0
## MP1517                 0     0     0     0     1
## MP1672                 0     0     0     0     0
## MP2821                 0     3     0     0     0
## TARA_102_SUR_0d2_3     0     0     0     0     0
  1. Community Bray-Curtis: tb18_tax_occur_ss1154_no_cero.bray
tb18_tax_occur_ss1154_no_cero.bray<-as.matrix(tb18_tax_occur_ss1154_no_cero.bray)
## [1] 43 43
  1. Stations distances in km: geo_distances_MP_18S
dim(geo_distances_MP_18S)
## [1] 43 43

Communities quickly change their composition across geographical distances:

plot(geo_distances_MP_18S, tb18_tax_occur_ss1154_no_cero.bray, pch=19, cex=0.4, xlab="Geopgraphical distances", ylab="Bray-Curtis dissimilarities")

1.4.1) Mantel correlograms

Mantel statistic is -significantlly- so low, meaning that the correlation between samples dissimilarity and geographical distances is weak.

mantel(geo_distances_MP_18S, tb18_tax_occur_ss1154_no_cero.bray)
## 
## Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel(xdis = geo_distances_MP_18S, ydis = tb18_tax_occur_ss1154_no_cero.bray) 
## 
## Mantel statistic r: 0.1186 
##       Significance: 0.018 
## 
## Upper quantiles of permutations (null model):
##    90%    95%  97.5%    99% 
## 0.0646 0.0894 0.1075 0.1334 
## Permutation: free
## Number of permutations: 999

Maximum distance between samples:

## [1] 19543.94

Minimum distance between samples:

## [1] 0

Correlograms:

MP_18s_ss1154_mantel_correl_by_1000km<-mantel.correlog(tb18_tax_occur_ss1154_no_cero.bray, D.geo=geo_distances_MP_18S, break.pts=seq(0,20000, by=1000))
plot(MP_18s_ss1154_mantel_correl_by_1000km)

MP_18s_ss1154_mantel_correl_by_100km<-mantel.correlog(tb18_tax_occur_ss1154_no_cero.bray, D.geo=geo_distances_MP_18S, break.pts=seq(0,20000, by=100))
plot(MP_18s_ss1154_mantel_correl_by_100km)

1.5) Abundance vs. occurence

In the following plot, we can appreciate the OTUs distribution according to their percentage of occurence and relative abundance. The red line keeps up OTUs that occur in more than 80% of the samples, the green line limits regionally rare OTUs (< 0.001%), and the blue one restricts regionally abundant OTUs (> 0.1%).

Regionally abundant OTUs (relative abundance over 0.1%):

##          mean_rabund perc_occur
## OTU_38   0.017492241  88.372093
## OTU_3677 0.013884970  81.395349
## OTU_751  0.010983032  37.209302
## OTU_5415 0.010761356  25.581395
## OTU_3374 0.008826730  93.023256
## OTU_2494 0.006932409  69.767442
## OTU_2631 0.006106163  74.418605
## OTU_3177 0.005682963  48.837209
## OTU_1032 0.005300069  72.093023
## OTU_3569 0.005159002  53.488372
## OTU_933  0.004796260  90.697674
## OTU_3282 0.004211842  37.209302
## OTU_2910 0.004070775  72.093023
## OTU_5120 0.004070775  39.534884
## OTU_3729 0.003909556  79.069767
## OTU_2200 0.003728185  86.046512
## OTU_1921 0.003687880  76.744186
## OTU_1383 0.003627423  88.372093
## OTU_3454 0.003607271  72.093023
## OTU_3227 0.003466205  34.883721
## OTU_1597 0.003385595  69.767442
## OTU_4176 0.003385595  34.883721
## OTU_4144 0.003244529  44.186047
## OTU_4227 0.003204224  32.558140
## OTU_1456 0.002901939  72.093023
## OTU_1833 0.002720567  72.093023
## OTU_974  0.002680263  67.441860
## OTU_4125 0.002599653  79.069767
## OTU_3038 0.002539196  76.744186
## OTU_4123 0.002519044  34.883721
## OTU_4103 0.002418282  37.209302
## OTU_2031 0.002377978  67.441860
## OTU_4518 0.002377978  44.186047
## OTU_3271 0.002357825  32.558140
## OTU_950  0.002317520   4.651163
## OTU_896  0.002297368  74.418605
## OTU_380  0.002156302  83.720930
## OTU_3871 0.002156302  69.767442
## OTU_2654 0.002115997  65.116279
## OTU_2996 0.002055540  69.767442
## OTU_949  0.002055540  65.116279
## OTU_3926 0.002015235  69.767442
## OTU_893  0.002015235  55.813953
## OTU_3560 0.001974930  62.790698
## OTU_2553 0.001974930  83.720930
## OTU_3056 0.001934626  72.093023
## OTU_973  0.001934626  67.441860
## OTU_1018 0.001874169  13.953488
## OTU_758  0.001874169  65.116279
## OTU_717  0.001833864  76.744186
## OTU_1047 0.001813712  67.441860
## OTU_1746 0.001793559  53.488372
## OTU_4260 0.001793559  41.860465
## OTU_256  0.001773407  76.744186
## OTU_2269 0.001753255  86.046512
## OTU_2195 0.001733102  72.093023
## OTU_2965 0.001733102  67.441860
## OTU_4134 0.001733102  48.837209
## OTU_2648 0.001712950  83.720930
## OTU_1343 0.001712950  69.767442
## OTU_2868 0.001712950  48.837209
## OTU_4226 0.001672645  37.209302
## OTU_2686 0.001632340  72.093023
## OTU_2043 0.001632340  60.465116
## OTU_1641 0.001612188  74.418605
## OTU_2305 0.001612188  74.418605
## OTU_3018 0.001612188  72.093023
## OTU_1496 0.001551731  69.767442
## OTU_2518 0.001551731  53.488372
## OTU_4533 0.001551731  39.534884
## OTU_4090 0.001551731  30.232558
## OTU_216  0.001531579  67.441860
## OTU_1997 0.001471122  74.418605
## OTU_2207 0.001450969  72.093023
## OTU_1635 0.001450969  69.767442
## OTU_3124 0.001450969  58.139535
## OTU_437  0.001450969  76.744186
## OTU_793  0.001430817  60.465116
## OTU_4089 0.001430817  37.209302
## OTU_3602 0.001410665  65.116279
## OTU_884  0.001390512  76.744186
## OTU_2120 0.001390512  58.139535
## OTU_3430 0.001390512  34.883721
## OTU_2853 0.001350208  69.767442
## OTU_2887 0.001350208  69.767442
## OTU_1551 0.001350208  65.116279
## OTU_4203 0.001350208  53.488372
## OTU_1066 0.001330055  58.139535
## OTU_2189 0.001330055  58.139535
## OTU_1929 0.001330055  48.837209
## OTU_1395 0.001330055  25.581395
## OTU_2594 0.001309903  69.767442
## OTU_1087 0.001289751  55.813953
## OTU_1446 0.001289751  69.767442
## OTU_1694 0.001289751  55.813953
## OTU_2730 0.001289751  53.488372
## OTU_983  0.001269598  55.813953
## OTU_2986 0.001249446  60.465116
## OTU_4190 0.001249446  30.232558
## OTU_2124 0.001229293  72.093023
## OTU_1628 0.001229293  58.139535
## OTU_628  0.001229293  58.139535
## OTU_922  0.001229293  58.139535
## OTU_6181 0.001229293  23.255814
## OTU_2078 0.001209141  67.441860
## OTU_938  0.001188989  67.441860
## OTU_990  0.001188989  67.441860
## OTU_1001 0.001188989  60.465116
## OTU_5484 0.001188989  34.883721
## OTU_2158 0.001168836  67.441860
## OTU_3196 0.001168836  58.139535
## OTU_2873 0.001148684  72.093023
## OTU_825  0.001148684  44.186047
## OTU_1261 0.001128532  58.139535
## OTU_2456 0.001128532  58.139535
## OTU_2966 0.001128532  58.139535
## OTU_914  0.001128532  58.139535
## OTU_458  0.001108379  60.465116
## OTU_1825 0.001108379  55.813953
## OTU_2089 0.001108379  46.511628
## OTU_5750 0.001108379  25.581395
## OTU_6608 0.001108379  18.604651
## OTU_2086 0.001068075  65.116279
## OTU_1911 0.001068075  48.837209
## OTU_599  0.001068075  48.837209
## OTU_1901 0.001047922  48.837209
## OTU_2219 0.001027770  67.441860
## OTU_1003 0.001027770  62.790698
## OTU_1975 0.001027770  58.139535
## OTU_295  0.001027770  58.139535
## OTU_2906 0.001027770  48.837209
## OTU_4110 0.001027770  41.860465
## OTU_373  0.001007618  51.162791
## OTU_1276 0.001007618  48.837209
## OTU_2466 0.001007618  46.511628
## OTU_3813 0.001007618  39.534884
## OTU_1012 0.001007618  60.465116
## OTU_363  0.001007618  44.186047
dim(tb18_ss1154_abundant_sorted)
## [1] 138   2

Proportion of regionally abundant OTUs (%):

## [1] 2.409218

Cosmopolitan OTUs (relative abundance over 0.1% and occurence in more than 80% of samples):

##          mean_rabund perc_occur
## OTU_3374 0.008826730   93.02326
## OTU_933  0.004796260   90.69767
## OTU_38   0.017492241   88.37209
## OTU_1383 0.003627423   88.37209
## OTU_2200 0.003728185   86.04651
## OTU_2269 0.001753255   86.04651
## OTU_380  0.002156302   83.72093
## OTU_2553 0.001974930   83.72093
## OTU_2648 0.001712950   83.72093
## OTU_3677 0.013884970   81.39535
dim(otu_tb18_ss1154_cosmop_sorted)
## [1] 10  2

Number and proportion (%) of cosmopolitan OTUs:

## [1] 10
## [1] 0.174581

Number and proportion (%) of rare OTUs:

nrow(otu_tb18_ss1154_rabund_percoccur[otu_tb18_ss1154_rabund_percoccur$mean_rabund < 0.00001 & otu_tb18_ss1154_rabund_percoccur$mean_rabund >0,])
## [1] 0
## [1] 0

1.6) Taxonomic composition analysis

1.6.1) Normalized data

1.6.1.1) Absolute values

Let’s add the taxonomic classification by merging “tb18_tax_occur_ss1154_no_cero” with “tb18_tax”:

## [1] 5728   46
##   Row.names MP0311 MP1517 MP1672 MP2821
## 1     OTU_1      0      0      0      0
## 2    OTU_10      0      1      3      0
## 3   OTU_100      0      0      0      0
## 4  OTU_1000      2      1      0      0
## 5  OTU_1001      0      2      0      5
## [1] 5728   45
##          MP0311 MP1517 MP1672 MP2821 TARA_102_SUR_0d2_3
## OTU_1         0      0      0      0                  0
## OTU_10        0      1      3      0                  0
## OTU_100       0      0      0      0                  0
## OTU_1000      2      1      0      0                  0
## OTU_1001      0      2      0      5                  0
## [1] 5728   46
##       MP0311 MP1517 MP1672 MP2821 TARA_102_SUR_0d2_3
## OTU_1      0      0      0      0                  0
## OTU_2      0      0      0      3                  0
## OTU_3      0      0      0      0                  0
## OTU_5      0      0      0      0                  0
## OTU_6      0      1      0      0                  0
## [1] 1905   46
##        MP0311 MP1517 MP1672 MP2821 TARA_102_SUR_0d2_3
## OTU_14      0      0      0      0                  0
## OTU_19      0      0      0      0                  0
## OTU_29      0      0      0      0                  0
## OTU_72      0      0      0      0                  0
## OTU_74      0      0      0      0                  0
##        MP0311 MP1517 MP1672 MP2821 TARA_102_SUR_0d2_3
## OTU_14      0      0      0      0                  0
## OTU_19      0      0      0      0                  0
## OTU_29      0      0      0      0                  0
## OTU_72      0      0      0      0                  0
## OTU_74      0      0      0      0                  0
## [1] 1044   43
## [1] 43
## [1] 32
## [1] 41
## [1] 43
## [1] 43
## [1] 27
## [1] 39
## [1] 34
## [1] 35
## [1] 0
## [1] 43
## [1] 39
## [1] 35
## [1] 18
## [1] 2
## [1] 0
## [1] 14
## [1] 2
## [1] 0
## [1] 5
## [1] 0
## [1] 0
## [1] 12
## [1] 25
## [1] 4
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 36
## [1] 16
## [1] 21
##                     Group.1    x
## 8               Dinophyceae 5453
## 17         Prymnesiophyceae 5180
## 10          Mamiellophyceae 3173
## 13            Pelagophyceae 2206
## 16 Prasinophyceae_clade-VII 1367
## 7          Dictyochophyceae  622
## 5             Chrysophyceae  599
## 6             Cryptophyceae  380
## 1         Bacillariophyceae  287
## 3      Chlorarachniophyceae  136
## 2             Bolidophyceae  132
## 18         Pyramimonadaceae  111
## 23     other_Prasinophyceae   93
## 9         Eustigmatophyceae   88
## 15  Prasinophyceae_clade-IX   53
## 12           Pavlovophyceae   51
## 20         Trebouxiophyceae   39
## 4             Chlorophyceae   25
## 11     Nephroselmidophyceae   19
## 22            Xanthophyceae    5
## 19             Rhodophyceae    4
## 21              Ulvophyceae    4
## 14             Phaeophyceae    2
##                     Group.1    x
## 8               Dinophyceae 1044
## 17         Prymnesiophyceae  165
## 5             Chrysophyceae  134
## 1         Bacillariophyceae  125
## 10          Mamiellophyceae  102
## 6             Cryptophyceae   57
## 7          Dictyochophyceae   48
## 16 Prasinophyceae_clade-VII   30
## 20         Trebouxiophyceae   30
## 13            Pelagophyceae   23
## 2             Bolidophyceae   21
## 4             Chlorophyceae   20
## 9         Eustigmatophyceae   19
## 23     other_Prasinophyceae   19
## 3      Chlorarachniophyceae   16
## 18         Pyramimonadaceae   14
## 15  Prasinophyceae_clade-IX   13
## 11     Nephroselmidophyceae    7
## 12           Pavlovophyceae    5
## 19             Rhodophyceae    4
## 22            Xanthophyceae    4
## 21              Ulvophyceae    3
## 14             Phaeophyceae    2
##                      reads_per_class OTUs_per_class
## Bacillariophyceae                287            125
## Bolidophyceae                    132             21
## Chlorarachniophyceae             136             16
## Chlorophyceae                     25             20
## Chrysophyceae                    599            134
##                          reads_per_class OTUs_per_class samples_per_class
## Dinophyceae                         5453           1044                43
## Prymnesiophyceae                    5180            165                43
## Mamiellophyceae                     3173            102                40
## Pelagophyceae                       2206             23                42
## Prasinophyceae_clade-VII            1367             30                39
## Dictyochophyceae                     622             48                43
## Chrysophyceae                        599            134                42
## Cryptophyceae                        380             57                32
## Bacillariophyceae                    287            125                38
## Chlorarachniophyceae                 136             16                35
## Bolidophyceae                        132             21                28
## Pyramimonadaceae                     111             14                36
## other_Prasinophyceae                  93             19                30
## Eustigmatophyceae                     88             19                22
## Prasinophyceae_clade-IX               53             13                16
## Pavlovophyceae                        51              5                26
## Trebouxiophyceae                      39             30                17
## Chlorophyceae                         25             20                13
## Nephroselmidophyceae                  19              7                11
## Xanthophyceae                          5              4                 1
## Rhodophyceae                           4              4                 3
## Ulvophyceae                            4              3                 1
## Phaeophyceae                           2              2                 1

1.6.1.1) Relative values

##   reads_per_class    OTUs_per_class samples_per_class 
##               100               100              1400
##                          reads_per_class OTUs_per_class samples_per_class
## Dinophyceae                 27.225522992     54.8031496        100.000000
## Prymnesiophyceae            25.862499376      8.6614173        100.000000
## Mamiellophyceae             15.842029058      5.3543307         93.023256
## Pelagophyceae               11.014029657      1.2073491         97.674419
## Prasinophyceae_clade-VII     6.825103600      1.5748031         90.697674
## Dictyochophyceae             3.105497029      2.5196850        100.000000
## Chrysophyceae                2.990663538      7.0341207         97.674419
## Cryptophyceae                1.897248989      2.9921260         74.418605
## Bacillariophyceae            1.432922263      6.5616798         88.372093
## Chlorarachniophyceae         0.679015428      0.8398950         81.395349
## Bolidophyceae                0.659044386      1.1023622         65.116279
## Pyramimonadaceae             0.554196415      0.7349081         83.720930
## other_Prasinophyceae         0.464326726      0.9973753         69.767442
## Eustigmatophyceae            0.439362924      0.9973753         51.162791
## Prasinophyceae_clade-IX      0.264616306      0.6824147         37.209302
## Pavlovophyceae               0.254630785      0.2624672         60.465116
## Trebouxiophyceae             0.194717659      1.5748031         39.534884
## Chlorophyceae                0.124819012      1.0498688         30.232558
## Nephroselmidophyceae         0.094862449      0.3674541         25.581395
## Xanthophyceae                0.024963802      0.2099738          2.325581
## Rhodophyceae                 0.019971042      0.2099738          6.976744
## Ulvophyceae                  0.019971042      0.1574803          2.325581
## Phaeophyceae                 0.009985521      0.1049869          2.325581

Reads per class vs. OTUs per class [PLOT DESCRIPTION]

## Warning in trans$transform(limits): NaNs produced

## Warning in trans$transform(limits): NaNs produced

## Warning in trans$transform(limits): NaNs produced

1.6.2) Non-normalized data

## [1] 8642   46
##   Row.names TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3
## 1     OTU_1                  0                  0                  0
## 2    OTU_10                  0                  3                  2
## 3   OTU_100                  0                  0                  0
## 4  OTU_1000                  0                  0                  0
## 5  OTU_1001                  0                  1                  0
##   TARA_111_SUR_0d2_3
## 1                  0
## 2                  9
## 3                  0
## 4                  0
## 5                  2
## [1] 8642   45
##          TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3
## OTU_1                     0                  0                  0
## OTU_10                    0                  3                  2
## OTU_100                   0                  0                  0
## OTU_1000                  0                  0                  0
## OTU_1001                  0                  1                  0
##          TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3
## OTU_1                     0                  0
## OTU_10                    9                  2
## OTU_100                   0                  3
## OTU_1000                  0                  2
## OTU_1001                  2                 15
## [1] 8642   46
##       TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3
## OTU_1                  0                  0                  0
## OTU_2                  0                  0                  0
## OTU_3                  0                  0                  0
## OTU_4                  0                  0                  0
## OTU_5                  0                  0                  0
##       TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3
## OTU_1                  0                  0
## OTU_2                  0                  0
## OTU_3                  1                  0
## OTU_4                  0                  0
## OTU_5                  0                  0
## [1] 2973   46
##        TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3
## OTU_14                  0                  0                  0
## OTU_19                  0                  0                  0
## OTU_29                  0                  1                  0
## OTU_34                  0                  0                  0
## OTU_72                  0                  0                  0
##        TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3
## OTU_14                  0                  0
## OTU_19                  0                  0
## OTU_29                  0                  0
## OTU_34                  0                  0
## OTU_72                  0                  2
##        TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3
## OTU_14                  0                  0                  0
## OTU_19                  0                  0                  0
## OTU_29                  0                  1                  0
## OTU_34                  0                  0                  0
## OTU_72                  0                  0                  0
##        TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3
## OTU_14                  0                  0
## OTU_19                  0                  0
## OTU_29                  0                  0
## OTU_34                  0                  0
## OTU_72                  0                  2
## [1] 1500   43
## [1] 43
## [1] 41
## [1] 43
## [1] 43
## [1] 43
## [1] 39
## [1] 42
## [1] 41
## [1] 43
## [1] 0
## [1] 43
## [1] 41
## [1] 42
## [1] 26
## [1] 3
## [1] 0
## [1] 26
## [1] 4
## [1] 1
## [1] 14
## [1] 0
## [1] 2
## [1] 19
## [1] 37
## [1] 13
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 42
## [1] 29
## [1] 33
##                         Group.1     x
## 19             Prymnesiophyceae 34758
## 8                   Dinophyceae 23637
## 11              Mamiellophyceae 15846
## 14                Pelagophyceae  9306
## 18     Prasinophyceae_clade-VII  3575
## 5                 Chrysophyceae  2824
## 7              Dictyochophyceae  2819
## 1             Bacillariophyceae  2664
## 6                 Cryptophyceae  1922
## 2                 Bolidophyceae   509
## 20             Pyramimonadaceae   507
## 25         other_Prasinophyceae   492
## 9             Eustigmatophyceae   359
## 3          Chlorarachniophyceae   327
## 13               Pavlovophyceae   228
## 17      Prasinophyceae_clade-IX   155
## 22             Trebouxiophyceae   121
## 4                 Chlorophyceae    79
## 12         Nephroselmidophyceae    64
## 24                Xanthophyceae    21
## 21                 Rhodophyceae    16
## 23                  Ulvophyceae     6
## 15                 Phaeophyceae     5
## 10 IncertaeSedis_Archaeplastida     1
## 16          Phaeothamniophyceae     1
##                         Group.1    x
## 8                   Dinophyceae 1500
## 1             Bacillariophyceae  393
## 5                 Chrysophyceae  252
## 19             Prymnesiophyceae  179
## 11              Mamiellophyceae  117
## 6                 Cryptophyceae  107
## 22             Trebouxiophyceae   60
## 7              Dictyochophyceae   53
## 4                 Chlorophyceae   52
## 18     Prasinophyceae_clade-VII   38
## 9             Eustigmatophyceae   27
## 14                Pelagophyceae   26
## 25         other_Prasinophyceae   26
## 2                 Bolidophyceae   25
## 3          Chlorarachniophyceae   21
## 20             Pyramimonadaceae   20
## 17      Prasinophyceae_clade-IX   17
## 21                 Rhodophyceae   15
## 13               Pavlovophyceae   14
## 24                Xanthophyceae   12
## 12         Nephroselmidophyceae    7
## 23                  Ulvophyceae    5
## 15                 Phaeophyceae    4
## 10 IncertaeSedis_Archaeplastida    2
## 16          Phaeothamniophyceae    1
##                      reads_per_class OTUs_per_class
## Bacillariophyceae               2664            393
## Bolidophyceae                    509             25
## Chlorarachniophyceae             327             21
## Chlorophyceae                     79             52
## Chrysophyceae                   2824            252
##                              reads_per_class OTUs_per_class
## Prymnesiophyceae                       34758            179
## Dinophyceae                            23637           1500
## Mamiellophyceae                        15846            117
## Pelagophyceae                           9306             26
## Prasinophyceae_clade-VII                3575             38
## Chrysophyceae                           2824            252
## Dictyochophyceae                        2819             53
## Bacillariophyceae                       2664            393
## Cryptophyceae                           1922            107
## Bolidophyceae                            509             25
## Pyramimonadaceae                         507             20
## other_Prasinophyceae                     492             26
## Eustigmatophyceae                        359             27
## Chlorarachniophyceae                     327             21
## Pavlovophyceae                           228             14
## Prasinophyceae_clade-IX                  155             17
## Trebouxiophyceae                         121             60
## Chlorophyceae                             79             52
## Nephroselmidophyceae                      64              7
## Xanthophyceae                             21             12
## Rhodophyceae                              16             15
## Ulvophyceae                                6              5
## Phaeophyceae                               5              4
## IncertaeSedis_Archaeplastida               1              2
## Phaeothamniophyceae                        1              1
##                              samples_per_class
## Prymnesiophyceae                            43
## Dinophyceae                                 43
## Mamiellophyceae                             43
## Pelagophyceae                              113
## Prasinophyceae_clade-VII                   116
## Chrysophyceae                              103
## Dictyochophyceae                           112
## Bacillariophyceae                          112
## Cryptophyceae                              108
## Bolidophyceae                              111
## Pyramimonadaceae                            97
## other_Prasinophyceae                        65
## Eustigmatophyceae                           35
## Chlorarachniophyceae                        75
## Pavlovophyceae                              75
## Prasinophyceae_clade-IX                     50
## Trebouxiophyceae                            18
## Chlorophyceae                                1
## Nephroselmidophyceae                         1
## Xanthophyceae                                1
## Rhodophyceae                                 1
## Ulvophyceae                                  1
## Phaeophyceae                                 1
## IncertaeSedis_Archaeplastida                 1
## Phaeothamniophyceae                          1
##   reads_per_class    OTUs_per_class samples_per_class 
##           100.000           100.000          3086.047
##                              reads_per_class OTUs_per_class
## Prymnesiophyceae                3.467409e+01     6.02085436
## Dinophyceae                     2.357994e+01    50.45408678
## Mamiellophyceae                 1.580775e+01     3.93541877
## Pelagophyceae                   9.283534e+00     0.87453750
## Prasinophyceae_clade-VII        3.566369e+00     1.27817020
## Chrysophyceae                   2.817182e+00     8.47628658
## Dictyochophyceae                2.812194e+00     1.78271107
## Bacillariophyceae               2.657569e+00    13.21897074
## Cryptophyceae                   1.917360e+00     3.59905819
## Bolidophyceae                   5.077712e-01     0.84090145
## Pyramimonadaceae                5.057760e-01     0.67272116
## other_Prasinophyceae            4.908122e-01     0.87453750
## Eustigmatophyceae               3.581333e-01     0.90817356
## Chlorarachniophyceae            3.262106e-01     0.70635721
## Pavlovophyceae                  2.274496e-01     0.47090481
## Prasinophyceae_clade-IX         1.546258e-01     0.57181298
## Trebouxiophyceae                1.207079e-01     2.01816347
## Chlorophyceae                   7.880928e-02     1.74907501
## Nephroselmidophyceae            6.384549e-02     0.23545240
## Xanthophyceae                   2.094930e-02     0.40363269
## Rhodophyceae                    1.596137e-02     0.50454087
## Ulvophyceae                     5.985515e-03     0.16818029
## Phaeophyceae                    4.987929e-03     0.13454423
## IncertaeSedis_Archaeplastida    9.975858e-04     0.06727212
## Phaeothamniophyceae             9.975858e-04     0.03363606
##                              samples_per_class
## Prymnesiophyceae                    100.000000
## Dinophyceae                         100.000000
## Mamiellophyceae                     100.000000
## Pelagophyceae                       262.790698
## Prasinophyceae_clade-VII            269.767442
## Chrysophyceae                       239.534884
## Dictyochophyceae                    260.465116
## Bacillariophyceae                   260.465116
## Cryptophyceae                       251.162791
## Bolidophyceae                       258.139535
## Pyramimonadaceae                    225.581395
## other_Prasinophyceae                151.162791
## Eustigmatophyceae                    81.395349
## Chlorarachniophyceae                174.418605
## Pavlovophyceae                      174.418605
## Prasinophyceae_clade-IX             116.279070
## Trebouxiophyceae                     41.860465
## Chlorophyceae                         2.325581
## Nephroselmidophyceae                  2.325581
## Xanthophyceae                         2.325581
## Rhodophyceae                          2.325581
## Ulvophyceae                           2.325581
## Phaeophyceae                          2.325581
## IncertaeSedis_Archaeplastida          2.325581
## Phaeothamniophyceae                   2.325581

OTUs per class vs. samples in which they occur [PLOT DESCRIPTION]

## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_text).

Reads per class vs. OTUs per class [PLOT DESCRIPTION]

## Warning in trans$transform(limits): NaNs produced

1) 16S amplicons

1.1) Data overview

Let’s read the dataset and remove the samples containing less than 36155 reads:

## [1] 28294    59
##   OTU_no TARA_100_SUR_0d2_3 TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3
## 1  OTU_1                  0                  0                  1
## 2  OTU_2                  1                  0                  0
## 3  OTU_3                  0                  0                  0
## 4  OTU_4                  0                  2                  3
## 5  OTU_5                  0                  0                  0
##   TARA_110_SUR_0d2_3
## 1                  0
## 2                  0
## 3                  0
## 4                  1
## 5                  0
## [1] 28294    58
##       TARA_100_SUR_0d2_3 TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3
## OTU_1                  0                  0                  1
## OTU_2                  1                  0                  0
## OTU_3                  0                  0                  0
## OTU_4                  0                  2                  3
## OTU_5                  0                  0                  0
##       TARA_110_SUR_0d2_3 TARA_111_SUR_0d2_3
## OTU_1                  0                  0
## OTU_2                  0                  0
## OTU_3                  0                  0
## OTU_4                  1                  5
## OTU_5                  0                  0
## [1] 28294     3
##                                                                                                                                                                                 OTUId
## OTU_1                                                               SILVA.v123_JN018772.1.1342_Bacteria;Proteobacteria;Alphaproteobacteria;SAR11_clade;Surface_1;uncultured_bacterium
## OTU_2                                                           SILVA.v123_GU474892.2198.3716_Bacteria;Marinimicrobia__SAR406_clade_;uncultured_Marinimicrobia_bacterium_HF4000_22B16
## OTU_3                SILVA.v123_KC899221.1.1360_Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales;Pseudoalteromonadaceae;Pseudoalteromonas;uncultured_Pseudoalteromonas_sp.
## OTU_4 SILVA.v123_HQ242598.1.1472_Bacteria;Proteobacteria;Gammaproteobacteria;Thiotrichales;Thiotrichales_Incertae_Sedis;Candidatus_Endoecteinascidia;uncultured_gamma_proteobacterium
## OTU_5                                                                                   SILVA.v123_GQ337196.1.1462_Bacteria;Chloroflexi;SAR202_clade;uncultured_Chloroflexi_bacterium
##              class_A        class_B
## OTU_1 other_bacteria other_bacteria
## OTU_2 other_bacteria other_bacteria
## OTU_3 other_bacteria other_bacteria
## OTU_4 other_bacteria other_bacteria
## OTU_5 other_bacteria other_bacteria
## [1] 28294    55
##       TARA_100_SUR_0d2_3 TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3
## OTU_1                  0                  0                  1
## OTU_2                  1                  0                  0
## OTU_3                  0                  0                  0
## OTU_4                  0                  2                  3
## OTU_5                  0                  0                  0
##       TARA_110_SUR_0d2_3 TARA_111_SUR_0d2_3
## OTU_1                  0                  0
## OTU_2                  0                  0
## OTU_3                  0                  0
## OTU_4                  1                  5
## OTU_5                  0                  0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13690   50950   83020   77920  100900  158900
##            MP1857            MP2243            MP1176            MP1421 
##             13690             17749             21609             26962 
## TARA_70_SUR_0d2_3 TARA_67_SUR_0d2_3 
##             28965             33399
## [1] 28294    49
## [1] 28294    43
## [1] 28294     0

Table dimensions and content outline:

## [1] 28294    43
##       TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3
## OTU_1                  0                  1                  0
## OTU_2                  0                  0                  0
## OTU_3                  0                  0                  0
## OTU_4                  2                  3                  1
## OTU_5                  0                  0                  0
##       TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3
## OTU_1                  0                  1
## OTU_2                  0                  0
## OTU_3                  0                  0
## OTU_4                  5                  2
## OTU_5                  0                  0

Minimum number of reads per station:

min(colSums(tb16_tax_occur_min36155)) 
## [1] 36155

Maximum number of reads per station:

max(colSums(tb16_tax_occur_min36155)) 
## [1] 158933

Identification of stations with higher number of reads:

amplicons_per_sample<-colSums(tb16_tax_occur_min36155)
amplicons_per_sample[which(colSums(tb16_tax_occur_min36155)>150000)]
## TARA_64_SUR_0d2_3 TARA_85_SUR_0d2_3 
##            158933            150654

Overall reads per sample:

1.2) Normalization

Let’s normalize the original dataset by randomly subsampling 36155 reads in each station:

tb16_tax_occur_min36155_t<-t(tb16_tax_occur_min36155)
tb16_tax_occur_ss36155<-rrarefy(tb16_tax_occur_min36155_t, 36155)

The normalized table shows the following dimensions and format:

## [1]    43 28294
##                    OTU_1 OTU_2 OTU_3 OTU_4 OTU_5
## TARA_102_SUR_0d2_3     0     0     0     1     0
## TARA_109_SUR_0d2_3     1     0     0     0     0
## TARA_110_SUR_0d2_3     0     0     0     0     0
## TARA_111_SUR_0d2_3     0     0     0     1     0
## TARA_112_SUR_0d2_3     0     0     0     0     0

Its content fits with the expected normalization values (36155 reads per station):

rowSums(tb16_tax_occur_ss36155)
## TARA_102_SUR_0d2_3 TARA_109_SUR_0d2_3 TARA_110_SUR_0d2_3 
##              36155              36155              36155 
## TARA_111_SUR_0d2_3 TARA_112_SUR_0d2_3 TARA_122_SUR_0d2_3 
##              36155              36155              36155 
## TARA_123_SUR_0d2_3 TARA_124_SUR_0d2_3 TARA_125_SUR_0d2_3 
##              36155              36155              36155 
## TARA_128_SUR_0d2_3 TARA_132_SUR_0d2_3 TARA_133_SUR_0d2_3 
##              36155              36155              36155 
## TARA_137_SUR_0d2_3 TARA_138_SUR_0d2_3 TARA_140_SUR_0d2_3 
##              36155              36155              36155 
## TARA_142_SUR_0d2_3 TARA_145_SUR_0d2_3 TARA_146_SUR_0d2_3 
##              36155              36155              36155 
## TARA_148_SUR_0d2_3 TARA_149_SUR_0d2_3 TARA_150_SUR_0d2_3 
##              36155              36155              36155 
## TARA_151_SUR_0d2_3 TARA_152_SUR_0d2_3  TARA_56_SUR_0d2_3 
##              36155              36155              36155 
##  TARA_57_SUR_0d2_3  TARA_62_SUR_0d2_3  TARA_64_SUR_0d2_3 
##              36155              36155              36155 
##  TARA_65_SUR_0d2_3  TARA_66_SUR_0d2_3  TARA_68_SUR_0d2_3 
##              36155              36155              36155 
##  TARA_72_SUR_0d2_3  TARA_76_SUR_0d2_3  TARA_78_SUR_0d2_3 
##              36155              36155              36155 
##  TARA_82_SUR_0d2_3  TARA_84_SUR_0d2_3  TARA_85_SUR_0d2_3 
##              36155              36155              36155 
##  TARA_96_SUR_0d2_3  TARA_98_SUR_0d2_3  TARA_99_SUR_0d2_3 
##              36155              36155              36155 
##             MP0311             MP1517             MP1672 
##              36155              36155              36155 
##             MP2821 
##              36155

Let’s check out how many OTUs don’t appear in the new table:

length(which(colSums(tb16_tax_occur_ss36155)==0)) #46
## [1] 8531

There are 8045 OTUs that don’t show any occurrence in the normalized data. Let’s remove them from the table and take a look at its final dimensions:

tb16_tax_occur_ss36155_no_cero<-tb16_tax_occur_ss36155[,-(which(colSums(tb16_tax_occur_ss36155)==0))]
tb16_tax_occur_ss36155_no_cero<-tb16_tax_occur_ss36155_no_cero[mixedorder(row.names(tb16_tax_occur_ss36155_no_cero)),]
dim(tb16_tax_occur_ss36155_no_cero) #92 913
## [1]    43 19763

Datasets summary:

dim(tb16_tax) #959 128
## [1] 28294    58
dim(tb16_tax_occur) #959 122
## [1] 28294    55
dim(tb16_tax_occur_ss36155_no_cero) #92 913
## [1]    43 19763